Skip to content

Commit fbbbee9

Browse files
committed
Implements aliases for common string literals
Signed-off-by: Simon Zuberek <szuberek@nvidia.com>
1 parent 5ec3f59 commit fbbbee9

File tree

4 files changed

+302
-57
lines changed

4 files changed

+302
-57
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ pipeline {
1717
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-27-23-0'
1818
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-13-23-2'
1919
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
20-
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-14-24-0'
20+
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
2121
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2222
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2323
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

nemo_text_processing/text_normalization/en/graph_utils.py

Lines changed: 82 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@
3535
NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize()
3636
NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
3737
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
38-
NEMO_NON_BREAKING_SPACE = u"\u00A0"
38+
NEMO_NON_BREAKING_SPACE = "\u00A0"
3939
NEMO_SPACE = " "
40-
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
40+
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
4141
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
4242
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
4343

@@ -79,25 +79,70 @@
7979
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
8080
delete_preserve_order = pynini.closure(
8181
pynutil.delete(" preserve_order: true")
82-
| (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\""))
82+
| (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
8383
)
8484

85+
# Common string literals; expand as you see fit.
86+
username_string = "username"
87+
double_quotes = '"'
88+
domain_string = "domain"
89+
protocol_string = "protocol"
90+
slash = "/"
91+
double_slash = "//"
92+
triple_slash = "///"
93+
file = "file"
94+
period = "."
95+
at = "@"
96+
colon = ":"
97+
https = "https"
98+
http = "http"
99+
www = "www"
100+
85101
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
86102
# _v = pynini.union("a", "e", "i", "o", "u")
87103
_c = pynini.union(
88-
"b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
104+
"b",
105+
"c",
106+
"d",
107+
"f",
108+
"g",
109+
"h",
110+
"j",
111+
"k",
112+
"l",
113+
"m",
114+
"n",
115+
"p",
116+
"q",
117+
"r",
118+
"s",
119+
"t",
120+
"v",
121+
"w",
122+
"x",
123+
"y",
124+
"z",
89125
)
90126
_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies")
91127
_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
92128
_s = NEMO_SIGMA + pynutil.insert("s")
93129

94130
graph_plural = plurals._priority_union(
95-
suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA
131+
suppletive,
132+
plurals._priority_union(
133+
_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA
134+
),
135+
NEMO_SIGMA,
96136
).optimize()
97137

98138
SINGULAR_TO_PLURAL = graph_plural
99139
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
100-
TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)])
140+
TO_LOWER = pynini.union(
141+
*[
142+
pynini.cross(x, y)
143+
for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)
144+
]
145+
)
101146
TO_UPPER = pynini.invert(TO_LOWER)
102147
MIN_NEG_WEIGHT = -0.0001
103148
MIN_POS_WEIGHT = 0.0001
@@ -107,8 +152,10 @@
107152

108153

109154
def capitalized_input_graph(
110-
graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None
111-
) -> 'pynini.FstLike':
155+
graph: "pynini.FstLike",
156+
original_graph_weight: float = None,
157+
capitalized_graph_weight: float = None,
158+
) -> "pynini.FstLike":
112159
"""
113160
Allow graph input to be capitalized (e.g. for ITN)
114161
@@ -123,13 +170,15 @@ def capitalized_input_graph(
123170
graph = pynutil.add_weight(graph, weight=original_graph_weight)
124171

125172
if capitalized_graph_weight is not None:
126-
capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight)
173+
capitalized_graph = pynutil.add_weight(
174+
capitalized_graph, weight=capitalized_graph_weight
175+
)
127176

128177
graph |= capitalized_graph
129178
return graph
130179

131180

132-
def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
181+
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
133182
"""
134183
Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
135184
@@ -141,7 +190,7 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
141190
for rule, graph in graphs.items():
142191
exporter[rule] = graph.optimize()
143192
exporter.close()
144-
logger.info(f'Created {file_name}')
193+
logger.info(f"Created {file_name}")
145194

146195

147196
def get_plurals(fst):
@@ -168,7 +217,7 @@ def get_singulars(fst):
168217
return PLURAL_TO_SINGULAR @ fst
169218

170219

171-
def convert_space(fst) -> 'pynini.FstLike':
220+
def convert_space(fst) -> "pynini.FstLike":
172221
"""
173222
Converts space to nonbreaking space.
174223
Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
@@ -179,7 +228,9 @@ def convert_space(fst) -> 'pynini.FstLike':
179228
180229
Returns output fst where breaking spaces are converted to non breaking spaces
181230
"""
182-
return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA)
231+
return fst @ pynini.cdrewrite(
232+
pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA
233+
)
183234

184235

185236
def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
@@ -191,7 +242,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
191242
written_capitalized = written[0].upper() + written[1:]
192243
additional_labels.extend(
193244
[
194-
[written_capitalized, spoken.capitalize()], # first letter capitalized
245+
[
246+
written_capitalized,
247+
spoken.capitalize(),
248+
], # first letter capitalized
195249
[
196250
written_capitalized,
197251
spoken.upper().replace(" AND ", " and "),
@@ -205,7 +259,10 @@ def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
205259
logger.debug(f"This is weight {weight}")
206260
if len(weight) == 0:
207261
additional_labels.extend(
208-
[[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
262+
[
263+
[written, spoken_no_space],
264+
[written_capitalized, spoken_no_space.upper()],
265+
]
209266
)
210267
else:
211268
additional_labels.extend(
@@ -237,9 +294,13 @@ def __init__(self, name: str, kind: str, deterministic: bool = True):
237294
self._fst = None
238295
self.deterministic = deterministic
239296

240-
self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
297+
self.far_path = Path(
298+
os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far"
299+
)
241300
if self.far_exist():
242-
self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
301+
self._fst = Far(
302+
self.far_path, mode="r", arc_type="standard", far_type="default"
303+
).get_fst()
243304

244305
def far_exist(self) -> bool:
245306
"""
@@ -248,14 +309,14 @@ def far_exist(self) -> bool:
248309
return self.far_path.exists()
249310

250311
@property
251-
def fst(self) -> 'pynini.FstLike':
312+
def fst(self) -> "pynini.FstLike":
252313
return self._fst
253314

254315
@fst.setter
255316
def fst(self, fst):
256317
self._fst = fst
257318

258-
def add_tokens(self, fst) -> 'pynini.FstLike':
319+
def add_tokens(self, fst) -> "pynini.FstLike":
259320
"""
260321
Wraps class name around the given fst
261322
@@ -267,7 +328,7 @@ def add_tokens(self, fst) -> 'pynini.FstLike':
267328
"""
268329
return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
269330

270-
def delete_tokens(self, fst) -> 'pynini.FstLike':
331+
def delete_tokens(self, fst) -> "pynini.FstLike":
271332
"""
272333
Deletes class name wrap around output of given fst
273334
@@ -286,4 +347,4 @@ def delete_tokens(self, fst) -> 'pynini.FstLike':
286347
+ delete_space
287348
+ pynutil.delete("}")
288349
)
289-
return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", NEMO_SIGMA)
350+
return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", NEMO_SIGMA)

0 commit comments

Comments
 (0)