Skip to content

Commit 33a27c9

Browse files
committed
Future Implementations
Signed-off-by: Namrata Gachchi <ngachchi@nvidia.com>
1 parent ac07488 commit 33a27c9

File tree

14 files changed

+243
-29
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ pipeline {
2727
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
2828
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
2929
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
30-
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
30+
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-06-25-0'
3131
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
3232
}
3333
stages {

nemo_text_processing/text_normalization/hi/data/measure/quarterly_units.tsv

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@ h घंटे
44
min मिनट
55
doz दर्जन
66
yr साल
7-
yr वर्ष
87
hp हॉर्सपॉवर
98
d दिन
109
month महीना
1110
months महीने
12-
हफ़्ते हफ़्ते
11+
हफ़्ते
12+
सप्ताह
13+
सदियां
14+
सदियों

nemo_text_processing/text_normalization/hi/data/measure/unit.tsv

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,6 @@ KHz किलोहर्ट्ज़
134134
N न्यूटन
135135
dB डेसीबल
136136
yr साल
137-
yr वर्ष
138137
hp हॉर्सपॉवर
139138
d दिन
140139
month महीना
nemo_text_processing/text_normalization/hi/data/whitelist/paune_mappings.tsv

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
एक
2+
दो
3+
तीन
4+
चार
5+
पाँच
6+
छह
7+
सात
8+
आठ
9+
नौ
10+
दस
11+
१० ग्यारह
12+
११ बारह
13+
१२ तेरह
14+
१३ चौदह
15+
१४ पंद्रह
16+
१५ सोलह
17+
१६ सत्रह
18+
१७ अठारह
19+
१८ उन्नीस
20+
१९ बीस
21+
२० इक्कीस
22+
२१ बाईस
23+
२२ तेईस
24+
२३ चौबीस
25+
२४ पच्चीस
26+
२५ छब्बीस
27+
२६ सत्ताईस
28+
२७ अट्ठाईस
29+
२८ उनतीस
30+
२९ तीस
31+
३० इकतीस
32+
३१ बत्तीस
33+
३२ तैंतीस
34+
३३ चौंतीस
35+
३४ पैंतीस
36+
३५ छत्तीस
37+
३६ सैंतीस
38+
३७ अड़तीस
39+
३८ उनतालीस
40+
३९ चालीस
41+
४० इकतालीस
42+
४१ बयालीस
43+
४२ तैंतालीस
44+
४३ चौवालीस
45+
४४ पैंतालीस
46+
४५ छियालीस
47+
४६ सैंतालीस
48+
४७ अड़तालीस
49+
४८ उनचास
50+
४९ पचास
51+
५० इक्यावन
52+
५१ बावन
53+
५२ तिरेपन
54+
५३ चौवन
55+
५४ पचपन
56+
५५ छप्पन
57+
५६ सत्तावन
58+
५७ अट्ठावन
59+
५८ उनसठ
60+
५९ साठ
61+
६० इकसठ
62+
६१ बासठ
63+
६२ तिरेसठ
64+
६३ चौंसठ
65+
६४ पैंसठ
66+
६५ छियासठ
67+
६६ सड़सठ
68+
६७ अड़सठ
69+
६८ उनहत्तर
70+
६९ सत्तर
71+
७० इकहत्तर
72+
७१ बहत्तर
73+
७२ तिहत्तर
74+
७३ चौहत्तर
75+
७४ पचहत्तर
76+
७५ छिहत्तर
77+
७६ सतहत्तर
78+
७७ अठहत्तर
79+
७८ उनासी
80+
७९ अस्सी
81+
८० इक्यासी
82+
८१ बयासी
83+
८२ तिरासी
84+
८३ चौरासी
85+
८४ पचासी
86+
८५ छियासी
87+
८६ सत्तासी
88+
८७ अट्ठासी
89+
८८ नवासी
90+
८९ नब्बे
91+
९० इक्यानबे
92+
९१ बानबे
93+
९२ तिरानबे
94+
९३ चौरानबे
95+
९४ पंचानबे
96+
९५ छियानबे
97+
९६ सत्तानबे
98+
९७ अट्ठानबे
99+
९८ निन्यानबे
100+
९९ एक सौ

nemo_text_processing/text_normalization/hi/taggers/cardinal.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@
2121

2222
class CardinalFst(GraphFst):
2323
"""
24-
Finite state transducer for classifying cardinals, e.g.
25-
-२३ -> cardinal { negative: "true" integer: "तेइस" } }
26-
s
27-
Args:
28-
deterministic: if True will provide a single transduction option,
29-
for False multiple transduction are generated (used for audio-based normalization)
24+
Finite state transducer for classifying cardinals, e.g.
25+
-२३ -> cardinal { negative: "true" integer: "तेईस" }
26+
27+
Args:
28+
deterministic: if True will provide a single transduction option,
29+
for False, multiple transductions are generated (used for audio-based normalization)
3030
"""
3131

3232
def __init__(self, deterministic: bool = True, lm: bool = False):
@@ -37,6 +37,10 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
3737
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
3838
teens_and_ties = pynutil.add_weight(teens_ties, -0.1)
3939

40+
self.digit = digit
41+
self.zero = zero
42+
self.teens_and_ties = teens_and_ties
43+
4044
def create_graph_suffix(digit_graph, suffix, zeros_counts):
4145
zero = pynutil.add_weight(pynutil.delete("०"), -0.1)
4246
if zeros_counts == 0:

nemo_text_processing/text_normalization/hi/taggers/decimal.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,7 @@ class DecimalFst(GraphFst):
5858
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
5959
super().__init__(name="decimal", kind="classify", deterministic=deterministic)
6060

61-
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
62-
graph_digit |= pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
63-
61+
graph_digit = cardinal.digit | cardinal.zero
6462
cardinal_graph = cardinal.final_graph
6563

6664
self.graph = graph_digit + pynini.closure(insert_space + graph_digit).optimize()

nemo_text_processing/text_normalization/hi/taggers/fraction.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from pynini.lib import pynutil
1717

1818
from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
19+
from nemo_text_processing.text_normalization.hi.utils import get_abs_path
1920

2021

2122
class FractionFst(GraphFst):
@@ -47,13 +48,43 @@ def __init__(self, cardinal, deterministic: bool = True):
4748
)
4849
self.denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"")
4950

50-
self.graph = (
51+
dedh_dhai_graph = pynini.string_map([("१ १/२", "डेढ़"), ("२ १/२", "ढाई")])
52+
53+
savva_numbers = cardinal_graph + pynini.cross(" १/४", "")
54+
savva_graph = pynutil.insert("सवा ") + savva_numbers
55+
56+
sadhe_numbers = cardinal_graph + pynini.cross(" १/२", "")
57+
sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers
58+
59+
paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
60+
paune_numbers = paune + pynini.cross(" ३/४", "")
61+
paune_graph = pynutil.insert("पौने ") + paune_numbers
62+
63+
graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ")
64+
65+
graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ")
66+
67+
graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ")
68+
69+
graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ")
70+
71+
final_graph = (
5172
self.optional_graph_negative
5273
+ pynini.closure(self.integer + pynini.accep(" "), 0, 1)
5374
+ self.numerator
5475
+ self.denominator
5576
)
5677

78+
weighted_graph = (
79+
final_graph
80+
| pynutil.add_weight(graph_dedh_dhai, -0.2)
81+
| pynutil.add_weight(graph_savva, -0.1)
82+
| pynutil.add_weight(graph_sadhe, -0.1)
83+
| pynutil.add_weight(graph_paune, -0.2)
84+
)
85+
86+
self.graph = weighted_graph
87+
5788
graph = self.graph
58-
final_graph = self.add_tokens(graph)
59-
self.fst = final_graph.optimize()
89+
graph = self.add_tokens(graph)
90+
self.fst = graph.optimize()

nemo_text_processing/text_normalization/hi/taggers/measure.py

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
4141
super().__init__(name="measure", kind="classify")
4242

4343
cardinal_graph = (
44-
digit
45-
| teens_and_ties
44+
cardinal.zero
45+
| cardinal.digit
46+
| cardinal.teens_and_ties
4647
| cardinal.graph_hundreds
4748
| cardinal.graph_thousands
4849
| cardinal.graph_ten_thousands
@@ -52,6 +53,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
5253
point = pynutil.delete(".")
5354
decimal_integers = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"")
5455
decimal_graph = decimal_integers + point + insert_space + decimal.graph_fractional
56+
5557
unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))
5658
quarterly_units_graph = pynini.string_file(get_abs_path("data/measure/quarterly_units.tsv"))
5759

@@ -93,10 +95,50 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
9395
+ unit
9496
)
9597

96-
graph_quarter = (
98+
dedh_dhai = pynini.string_map([("१.५", "डेढ़"), ("२.५", "ढाई")])
99+
dedh_dhai_graph = pynutil.insert("integer: \"") + dedh_dhai + pynutil.insert("\"")
100+
101+
savva_numbers = cardinal_graph + pynini.cross(".२५", "")
102+
savva_graph = pynutil.insert("integer: \"सवा ") + savva_numbers + pynutil.insert("\"")
103+
104+
sadhe_numbers = cardinal_graph + pynini.cross(".५", "")
105+
sadhe_graph = pynutil.insert("integer: \"साढ़े ") + sadhe_numbers + pynutil.insert("\"")
106+
107+
paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
108+
paune_numbers = paune + pynini.cross(".७५", "")
109+
paune_graph = pynutil.insert("integer: \"पौने ") + paune_numbers + pynutil.insert("\"")
110+
111+
graph_dedh_dhai = (
112+
pynutil.insert("cardinal { ")
113+
+ optional_graph_negative
114+
+ dedh_dhai_graph
115+
+ pynutil.insert(" }")
116+
+ delete_space
117+
+ units
118+
)
119+
120+
graph_savva = (
121+
pynutil.insert("cardinal { ")
122+
+ optional_graph_negative
123+
+ savva_graph
124+
+ pynutil.insert(" }")
125+
+ delete_space
126+
+ units
127+
)
128+
129+
graph_sadhe = (
130+
pynutil.insert("cardinal { ")
131+
+ optional_graph_negative
132+
+ sadhe_graph
133+
+ pynutil.insert(" }")
134+
+ delete_space
135+
+ units
136+
)
137+
138+
graph_paune = (
97139
pynutil.insert("cardinal { ")
98140
+ optional_graph_negative
99-
+ quarter_graph
141+
+ paune_graph
100142
+ pynutil.insert(" }")
101143
+ delete_space
102144
+ units
@@ -135,9 +177,12 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
135177

136178
graph = (
137179
pynutil.add_weight(graph_decimal, 0.01)
138-
| pynutil.add_weight(graph_quarter, 0.005)
139180
| pynutil.add_weight(graph_cardinal, 0.01)
140181
| pynutil.add_weight(graph_exceptions, 0.01)
182+
| pynutil.add_weight(graph_dedh_dhai, 0.001)
183+
| pynutil.add_weight(graph_savva, 0.005)
184+
| pynutil.add_weight(graph_sadhe, 0.005)
185+
| pynutil.add_weight(graph_paune, -0.2)
141186
)
142187
self.graph = graph.optimize()
143188

nemo_text_processing/text_normalization/hi/taggers/time.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,11 @@ class TimeFst(GraphFst):
3636
for False, multiple transductions are generated (used for audio-based normalization)
3737
"""
3838

39-
def __init__(self):
39+
def __init__(self, cardinal: GraphFst):
4040
super().__init__(name="time", kind="classify")
4141

4242
delete_colon = pynutil.delete(":")
43+
cardinal_graph = cardinal.digit | cardinal.teens_and_ties
4344

4445
self.hours = pynutil.insert("hours: \"") + hours_graph + pynutil.insert("\" ")
4546
self.minutes = pynutil.insert("minutes: \"") + minutes_graph + pynutil.insert("\" ")
@@ -56,7 +57,35 @@ def __init__(self):
5657
# hour
5758
graph_h = self.hours + delete_colon + pynutil.delete("००")
5859

59-
final_graph = graph_hms | graph_hm | graph_h
60+
dedh_dhai_graph = pynini.string_map([("१:३०", "डेढ़"), ("२:३०", "ढाई")])
61+
62+
savva_numbers = cardinal_graph + pynini.cross(":१५", "")
63+
savva_graph = pynutil.insert("सवा ") + savva_numbers
64+
65+
sadhe_numbers = cardinal_graph + pynini.cross(":३०", "")
66+
sadhe_graph = pynutil.insert("साढ़े ") + sadhe_numbers
67+
68+
paune = pynini.string_file(get_abs_path("data/whitelist/paune_mappings.tsv"))
69+
paune_numbers = paune + pynini.cross(":४५", "")
70+
paune_graph = pynutil.insert("पौने ") + paune_numbers
71+
72+
graph_dedh_dhai = pynutil.insert("morphosyntactic_features: \"") + dedh_dhai_graph + pynutil.insert("\" ")
73+
74+
graph_savva = pynutil.insert("morphosyntactic_features: \"") + savva_graph + pynutil.insert("\" ")
75+
76+
graph_sadhe = pynutil.insert("morphosyntactic_features: \"") + sadhe_graph + pynutil.insert("\" ")
77+
78+
graph_paune = pynutil.insert("morphosyntactic_features: \"") + paune_graph + pynutil.insert("\" ")
79+
80+
final_graph = (
81+
graph_hms
82+
| pynutil.add_weight(graph_hm, 0.01)
83+
| pynutil.add_weight(graph_h, 0.01)
84+
| pynutil.add_weight(graph_dedh_dhai, 0.001)
85+
| pynutil.add_weight(graph_savva, 0.005)
86+
| pynutil.add_weight(graph_sadhe, 0.005)
87+
| pynutil.add_weight(graph_paune, 0.001)
88+
)
6089

6190
final_graph = self.add_tokens(final_graph)
6291
self.fst = final_graph.optimize()

nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def __init__(
9898
logging.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes")
9999

100100
start_time = time.time()
101-
timefst = TimeFst()
101+
timefst = TimeFst(cardinal=cardinal)
102102
time_graph = timefst.fst
103103
logging.debug(f"time: {time.time() - start_time: .2f}s -- {time_graph.num_states()} nodes")
104104

0 commit comments

Comments
 (0)